import numpy as np #linear algebra
import pandas as pd # data processing,CSV file I/O(e.g pd.read_csv)
import seaborn as sns # for statistical data visualization
import plotly.express as px
import matplotlib.pyplot as plt # for data visualization
%matplotlib inline
from warnings import filterwarnings
filterwarnings("ignore")
# Load the dataset from a local CSV file into a DataFrame.
# NOTE(review): this is an absolute path under one user's Windows profile, so
# the notebook only runs unchanged on a machine that has the file there.
data = pd.read_csv(r"C:\Users\laxma\Downloads\dataset.csv")
# Bare expression: in a notebook cell this renders the DataFrame as output.
data
| Year | Month | Sector | Hydroelectric Power | Geothermal Energy | Solar Energy | Wind Energy | Wood Energy | Waste Energy | Fuel Ethanol, Excluding Denaturant | Biomass Losses and Co-products | Biomass Energy | Total Renewable Energy | Renewable Diesel Fuel | Other Biofuels | Conventional Hydroelectric Power | Biodiesel | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1973 | 1 | Commerical | 0.000 | 0.000 | 0.000 | 0.000 | 0.570 | 0.000 | 0.000 | 0.000 | 0.570 | 0.570 | 0.00 | 0.000 | 0.000 | 0.000 |
| 1 | 1973 | 1 | Electric Power | 0.000 | 0.490 | 0.000 | 0.000 | 0.054 | 0.157 | 0.000 | 0.000 | 0.211 | 89.223 | 0.00 | 0.000 | 88.522 | 0.000 |
| 2 | 1973 | 1 | Industrial | 1.040 | 0.000 | 0.000 | 0.000 | 98.933 | 0.000 | 0.000 | 0.000 | 98.933 | 99.973 | 0.00 | 0.000 | 0.000 | 0.000 |
| 3 | 1973 | 1 | Residential | 0.000 | 0.000 | 0.000 | 0.000 | 30.074 | 0.000 | 0.000 | 0.000 | 0.000 | 30.074 | 0.00 | 0.000 | 0.000 | 0.000 |
| 4 | 1973 | 1 | Transportation | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.00 | 0.000 | 0.000 | 0.000 |
| ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... | ... |
| 3060 | 2024 | 1 | Commerical | 0.073 | 1.669 | 4.267 | 0.036 | 7.053 | 6.233 | 2.441 | 0.000 | 15.728 | 21.773 | 0.00 | 0.000 | 0.000 | 0.000 |
| 3061 | 2024 | 1 | Electric Power | 0.000 | 4.667 | 32.707 | 119.265 | 15.071 | 13.873 | 0.000 | 0.000 | 28.944 | 257.661 | 0.00 | 0.000 | 72.078 | 0.000 |
| 3062 | 2024 | 1 | Industrial | 0.308 | 0.356 | 0.987 | 0.035 | 104.878 | 14.171 | 1.533 | 67.742 | 188.325 | 190.011 | 0.00 | 0.000 | 0.000 | 0.000 |
| 3063 | 2024 | 1 | Residential | 0.000 | 3.354 | 14.897 | 0.000 | 34.065 | 0.000 | 0.000 | 0.000 | 0.000 | 52.316 | 0.00 | 0.000 | 0.000 | 0.000 |
| 3064 | 2024 | 1 | Transportation | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 86.098 | 0.000 | 140.188 | 0.000 | 30.78 | 3.442 | 0.000 | 19.867 |
3065 rows × 17 columns
data.head()
| Year | Month | Sector | Hydroelectric Power | Geothermal Energy | Solar Energy | Wind Energy | Wood Energy | Waste Energy | Fuel Ethanol, Excluding Denaturant | Biomass Losses and Co-products | Biomass Energy | Total Renewable Energy | Renewable Diesel Fuel | Other Biofuels | Conventional Hydroelectric Power | Biodiesel | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 1973 | 1 | Commerical | 0.00 | 0.00 | 0.0 | 0.0 | 0.570 | 0.000 | 0.0 | 0.0 | 0.570 | 0.570 | 0.0 | 0.0 | 0.000 | 0.0 |
| 1 | 1973 | 1 | Electric Power | 0.00 | 0.49 | 0.0 | 0.0 | 0.054 | 0.157 | 0.0 | 0.0 | 0.211 | 89.223 | 0.0 | 0.0 | 88.522 | 0.0 |
| 2 | 1973 | 1 | Industrial | 1.04 | 0.00 | 0.0 | 0.0 | 98.933 | 0.000 | 0.0 | 0.0 | 98.933 | 99.973 | 0.0 | 0.0 | 0.000 | 0.0 |
| 3 | 1973 | 1 | Residential | 0.00 | 0.00 | 0.0 | 0.0 | 30.074 | 0.000 | 0.0 | 0.0 | 0.000 | 30.074 | 0.0 | 0.0 | 0.000 | 0.0 |
| 4 | 1973 | 1 | Transportation | 0.00 | 0.00 | 0.0 | 0.0 | 0.000 | 0.000 | 0.0 | 0.0 | 0.000 | 0.000 | 0.0 | 0.0 | 0.000 | 0.0 |
data.tail()
| Year | Month | Sector | Hydroelectric Power | Geothermal Energy | Solar Energy | Wind Energy | Wood Energy | Waste Energy | Fuel Ethanol, Excluding Denaturant | Biomass Losses and Co-products | Biomass Energy | Total Renewable Energy | Renewable Diesel Fuel | Other Biofuels | Conventional Hydroelectric Power | Biodiesel | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 3060 | 2024 | 1 | Commerical | 0.073 | 1.669 | 4.267 | 0.036 | 7.053 | 6.233 | 2.441 | 0.000 | 15.728 | 21.773 | 0.00 | 0.000 | 0.000 | 0.000 |
| 3061 | 2024 | 1 | Electric Power | 0.000 | 4.667 | 32.707 | 119.265 | 15.071 | 13.873 | 0.000 | 0.000 | 28.944 | 257.661 | 0.00 | 0.000 | 72.078 | 0.000 |
| 3062 | 2024 | 1 | Industrial | 0.308 | 0.356 | 0.987 | 0.035 | 104.878 | 14.171 | 1.533 | 67.742 | 188.325 | 190.011 | 0.00 | 0.000 | 0.000 | 0.000 |
| 3063 | 2024 | 1 | Residential | 0.000 | 3.354 | 14.897 | 0.000 | 34.065 | 0.000 | 0.000 | 0.000 | 0.000 | 52.316 | 0.00 | 0.000 | 0.000 | 0.000 |
| 3064 | 2024 | 1 | Transportation | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 0.000 | 86.098 | 0.000 | 140.188 | 0.000 | 30.78 | 3.442 | 0.000 | 19.867 |
data.shape
(3065, 17)
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3065 entries, 0 to 3064 Data columns (total 17 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Year 3065 non-null int64 1 Month 3065 non-null int64 2 Sector 3065 non-null object 3 Hydroelectric Power 3065 non-null float64 4 Geothermal Energy 3065 non-null float64 5 Solar Energy 3065 non-null float64 6 Wind Energy 3065 non-null float64 7 Wood Energy 3065 non-null float64 8 Waste Energy 3065 non-null float64 9 Fuel Ethanol, Excluding Denaturant 3065 non-null float64 10 Biomass Losses and Co-products 3065 non-null float64 11 Biomass Energy 3065 non-null float64 12 Total Renewable Energy 3065 non-null float64 13 Renewable Diesel Fuel 3065 non-null float64 14 Other Biofuels 3065 non-null float64 15 Conventional Hydroelectric Power 3065 non-null float64 16 Biodiesel 3065 non-null float64 dtypes: float64(14), int64(2), object(1) memory usage: 407.2+ KB
data.isnull().sum()
Year 0 Month 0 Sector 0 Hydroelectric Power 0 Geothermal Energy 0 Solar Energy 0 Wind Energy 0 Wood Energy 0 Waste Energy 0 Fuel Ethanol, Excluding Denaturant 0 Biomass Losses and Co-products 0 Biomass Energy 0 Total Renewable Energy 0 Renewable Diesel Fuel 0 Other Biofuels 0 Conventional Hydroelectric Power 0 Biodiesel 0 dtype: int64
# Build a copy of the data with missing values replaced by 0 and print the
# per-column count of remaining nulls.
# NOTE(review): the null check just above already reports 0 missing values,
# and `data_filled` is not referenced again in the visible code; the later
# cells keep operating on `data`.
data_filled = data.fillna(0)
print(data_filled.isnull().sum())
Year 0 Month 0 Sector 0 Hydroelectric Power 0 Geothermal Energy 0 Solar Energy 0 Wind Energy 0 Wood Energy 0 Waste Energy 0 Fuel Ethanol, Excluding Denaturant 0 Biomass Losses and Co-products 0 Biomass Energy 0 Total Renewable Energy 0 Renewable Diesel Fuel 0 Other Biofuels 0 Conventional Hydroelectric Power 0 Biodiesel 0 dtype: int64
data.duplicated().sum()
0
data['Year'].unique()
array([1973, 1974, 1975, 1976, 1977, 1978, 1979, 1980, 1981, 1982, 1983,
1984, 1985, 1986, 1987, 1988, 1989, 1990, 1991, 1992, 1993, 1994,
1995, 1996, 1997, 1998, 1999, 2000, 2001, 2002, 2003, 2004, 2005,
2006, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2014, 2015, 2016,
2017, 2018, 2019, 2020, 2021, 2022, 2023, 2024], dtype=int64)
data['Month'].unique()
array([ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12], dtype=int64)
len(data['Month'].unique())
12
data['Sector'].unique()
array(['Commerical', 'Electric Power', 'Industrial', 'Residential',
'Transportation'], dtype=object)
len(data['Sector'].unique())
5
# Remove the two calendar columns from `data` in place, then print the
# summary of the columns that remain.
data.drop(columns=['Month', 'Year'], inplace=True)
data.info()
<class 'pandas.core.frame.DataFrame'> RangeIndex: 3065 entries, 0 to 3064 Data columns (total 15 columns): # Column Non-Null Count Dtype --- ------ -------------- ----- 0 Sector 3065 non-null object 1 Hydroelectric Power 3065 non-null float64 2 Geothermal Energy 3065 non-null float64 3 Solar Energy 3065 non-null float64 4 Wind Energy 3065 non-null float64 5 Wood Energy 3065 non-null float64 6 Waste Energy 3065 non-null float64 7 Fuel Ethanol, Excluding Denaturant 3065 non-null float64 8 Biomass Losses and Co-products 3065 non-null float64 9 Biomass Energy 3065 non-null float64 10 Total Renewable Energy 3065 non-null float64 11 Renewable Diesel Fuel 3065 non-null float64 12 Other Biofuels 3065 non-null float64 13 Conventional Hydroelectric Power 3065 non-null float64 14 Biodiesel 3065 non-null float64 dtypes: float64(14), object(1) memory usage: 359.3+ KB
data.head()
| Sector | Hydroelectric Power | Geothermal Energy | Solar Energy | Wind Energy | Wood Energy | Waste Energy | Fuel Ethanol, Excluding Denaturant | Biomass Losses and Co-products | Biomass Energy | Total Renewable Energy | Renewable Diesel Fuel | Other Biofuels | Conventional Hydroelectric Power | Biodiesel | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | Commerical | 0.00 | 0.00 | 0.0 | 0.0 | 0.570 | 0.000 | 0.0 | 0.0 | 0.570 | 0.570 | 0.0 | 0.0 | 0.000 | 0.0 |
| 1 | Electric Power | 0.00 | 0.49 | 0.0 | 0.0 | 0.054 | 0.157 | 0.0 | 0.0 | 0.211 | 89.223 | 0.0 | 0.0 | 88.522 | 0.0 |
| 2 | Industrial | 1.04 | 0.00 | 0.0 | 0.0 | 98.933 | 0.000 | 0.0 | 0.0 | 98.933 | 99.973 | 0.0 | 0.0 | 0.000 | 0.0 |
| 3 | Residential | 0.00 | 0.00 | 0.0 | 0.0 | 30.074 | 0.000 | 0.0 | 0.0 | 0.000 | 30.074 | 0.0 | 0.0 | 0.000 | 0.0 |
| 4 | Transportation | 0.00 | 0.00 | 0.0 | 0.0 | 0.000 | 0.000 | 0.0 | 0.0 | 0.000 | 0.000 | 0.0 | 0.0 | 0.000 | 0.0 |
data.columns
Index(['Sector', 'Hydroelectric Power', 'Geothermal Energy', 'Solar Energy',
'Wind Energy', 'Wood Energy', 'Waste Energy',
'Fuel Ethanol, Excluding Denaturant', 'Biomass Losses and Co-products',
'Biomass Energy', 'Total Renewable Energy', 'Renewable Diesel Fuel',
'Other Biofuels', 'Conventional Hydroelectric Power', 'Biodiesel'],
dtype='object')
# VISUALIZATION

# Interactive violin plot of geothermal energy against waste energy.
# NOTE(review): 'Waste Energy' is a float column, and it is used both as the
# x axis and as the colour grouping, which presumably yields one violin per
# distinct value. Confirm this is intended; a categorical column such as
# 'Sector' may have been meant.
fig = px.violin(
    data,
    x='Waste Energy',
    y='Geothermal Energy',
    color='Waste Energy',
)
fig.show()

# Red scatter of solar energy against geothermal energy, with the x tick
# labels rotated to vertical.
geothermal = data['Geothermal Energy']
solar = data['Solar Energy']
plt.scatter(geothermal, solar, color='red')
plt.xticks(rotation=90)
plt.show()

# Line plot of wood energy over wind energy.
sns.lineplot(data=data, x='Wind Energy', y='Wood Energy')
<AxesSubplot:xlabel='Wind Energy', ylabel='Wood Energy'>
# Scatter of solar energy against biomass energy on an 8x4 figure, with the
# axis labels set explicitly.
plt.figure(figsize=(8, 4))
sns.scatterplot(x='Biomass Energy', y='Solar Energy', data=data)
plt.xlabel('Biomass Energy')
plt.ylabel('Solar Energy')
plt.show()

# Distribution plot of the renewable diesel fuel column.
renewable_diesel = data["Renewable Diesel Fuel"]
sns.displot(renewable_diesel)
<seaborn.axisgrid.FacetGrid at 0x2edc670b670>
sns.relplot(x='Total Renewable Energy',y='Biomass Losses and Co-products',data=data)
<seaborn.axisgrid.FacetGrid at 0x2edc6266310>
sns.countplot(x='Sector',data=data)
<AxesSubplot:xlabel='Sector', ylabel='count'>
# MODEL BUILDING
from sklearn.preprocessing import LabelEncoder

# Integer-encode the sector names. They serve only as the reference labels
# that the clusters are compared against later; `y` is a NumPy array.
le = LabelEncoder()
y = le.fit_transform(data['Sector'])

# Feature matrix: every column except the label.
# drop() returns a new DataFrame, so `data` keeps its original 'Sector'
# strings (the previous `X = data` alias overwrote them in place), and the
# label is no longer leaked into the features that K-Means clusters on.
X = data.drop(columns=['Sector'])
X.head()
| Sector | Hydroelectric Power | Geothermal Energy | Solar Energy | Wind Energy | Wood Energy | Waste Energy | Fuel Ethanol, Excluding Denaturant | Biomass Losses and Co-products | Biomass Energy | Total Renewable Energy | Renewable Diesel Fuel | Other Biofuels | Conventional Hydroelectric Power | Biodiesel | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0 | 0.00 | 0.00 | 0.0 | 0.0 | 0.570 | 0.000 | 0.0 | 0.0 | 0.570 | 0.570 | 0.0 | 0.0 | 0.000 | 0.0 |
| 1 | 1 | 0.00 | 0.49 | 0.0 | 0.0 | 0.054 | 0.157 | 0.0 | 0.0 | 0.211 | 89.223 | 0.0 | 0.0 | 88.522 | 0.0 |
| 2 | 2 | 1.04 | 0.00 | 0.0 | 0.0 | 98.933 | 0.000 | 0.0 | 0.0 | 98.933 | 99.973 | 0.0 | 0.0 | 0.000 | 0.0 |
| 3 | 3 | 0.00 | 0.00 | 0.0 | 0.0 | 30.074 | 0.000 | 0.0 | 0.0 | 0.000 | 30.074 | 0.0 | 0.0 | 0.000 | 0.0 |
| 4 | 4 | 0.00 | 0.00 | 0.0 | 0.0 | 0.000 | 0.000 | 0.0 | 0.0 | 0.000 | 0.000 | 0.0 | 0.0 | 0.000 | 0.0 |
from sklearn.preprocessing import MinMaxScaler

# Keep the column labels: fit_transform() returns a plain NumPy array, so
# they have to be re-attached when the DataFrame is rebuilt.
cols = X.columns

# Rescale every column to MinMaxScaler's default [0, 1] range so that no
# single large-magnitude column dominates the distances used by K-Means.
ms = MinMaxScaler()
scaled_values = ms.fit_transform(X)

# Pass the Index itself. Wrapping it in a list (`columns=[cols]`) makes
# pandas build MultiIndex columns, which changes how columns are selected.
X = pd.DataFrame(scaled_values, columns=cols)
X.head()
| Sector | Hydroelectric Power | Geothermal Energy | Solar Energy | Wind Energy | Wood Energy | Waste Energy | Fuel Ethanol, Excluding Denaturant | Biomass Losses and Co-products | Biomass Energy | Total Renewable Energy | Renewable Diesel Fuel | Other Biofuels | Conventional Hydroelectric Power | Biodiesel | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 0 | 0.00 | 0.000976 | 0.000000 | 0.0 | 0.0 | 0.003104 | 0.000000 | 0.0 | 0.0 | 0.002444 | 0.001850 | 0.0 | 0.0 | 0.00000 | 0.0 |
| 1 | 0.25 | 0.000976 | 0.082339 | 0.0 | 0.0 | 0.000294 | 0.004776 | 0.0 | 0.0 | 0.000905 | 0.289521 | 0.0 | 0.0 | 0.75368 | 0.0 |
| 2 | 0.50 | 0.508541 | 0.000000 | 0.0 | 0.0 | 0.538769 | 0.000000 | 0.0 | 0.0 | 0.424241 | 0.324403 | 0.0 | 0.0 | 0.00000 | 0.0 |
| 3 | 0.75 | 0.000976 | 0.000000 | 0.0 | 0.0 | 0.163777 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.097587 | 0.0 | 0.0 | 0.00000 | 0.0 |
| 4 | 1.00 | 0.000976 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.000000 | 0.000000 | 0.0 | 0.0 | 0.00000 | 0.0 |
from sklearn.cluster import KMeans

# Elbow method: fit K-Means for k = 1..10 and record each model's inertia
# (`inertia_`), then plot it against k to look for the bend in the curve.
#
# NOTE(review): the recorded run of this cell ended in an AttributeError
# raised inside threadpoolctl (its BLAS version probe received None). That
# points at the installed threadpoolctl/BLAS combination rather than at this
# code; upgrading threadpoolctl is the usually suggested remedy. Verify in
# the target environment.
k_values = range(1, 11)
cs = []
for i in k_values:
    kmeans = KMeans(
        n_clusters=i,
        init='k-means++',
        max_iter=300,
        n_init=10,
        random_state=0,  # fixed seed so the curve is reproducible
    )
    kmeans.fit(X)
    cs.append(kmeans.inertia_)

plt.plot(k_values, cs)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('CS')
plt.show()
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) ~\AppData\Local\Temp\ipykernel_20800\957846521.py in <module> 3 for i in range(1, 11): 4 kmeans = KMeans(n_clusters = i, init = 'k-means++', max_iter = 300, n_init = 10, random_state = 0) ----> 5 kmeans.fit(X) 6 cs.append(kmeans.inertia_) 7 plt.plot(range(1, 11), cs) D:\anaconda files\lib\site-packages\sklearn\cluster\_kmeans.py in fit(self, X, y, sample_weight) 1184 1185 # run a k-means once -> 1186 labels, inertia, centers, n_iter_ = kmeans_single( 1187 X, 1188 sample_weight, D:\anaconda files\lib\site-packages\sklearn\cluster\_kmeans.py in _kmeans_single_lloyd(X, sample_weight, centers_init, max_iter, verbose, x_squared_norms, tol, n_threads) 623 # Threadpoolctl context to limit the number of threads in second level of 624 # nested parallelism (i.e. BLAS) to avoid oversubsciption. --> 625 with threadpool_limits(limits=1, user_api="blas"): 626 for i in range(max_iter): 627 lloyd_iter( D:\anaconda files\lib\site-packages\sklearn\utils\fixes.py in threadpool_limits(limits, user_api) 312 return controller.limit(limits=limits, user_api=user_api) 313 else: --> 314 return threadpoolctl.threadpool_limits(limits=limits, user_api=user_api) 315 316 D:\anaconda files\lib\site-packages\threadpoolctl.py in __init__(self, limits, user_api) 169 self._check_params(limits, user_api) 170 --> 171 self._original_info = self._set_threadpool_limits() 172 173 def __enter__(self): D:\anaconda files\lib\site-packages\threadpoolctl.py in _set_threadpool_limits(self) 266 return None 267 --> 268 modules = _ThreadpoolInfo(prefixes=self._prefixes, 269 user_api=self._user_api) 270 for module in modules: D:\anaconda files\lib\site-packages\threadpoolctl.py in __init__(self, user_api, prefixes, modules) 338 339 self.modules = [] --> 340 self._load_modules() 341 self._warn_if_incompatible_openmp() 342 else: D:\anaconda files\lib\site-packages\threadpoolctl.py in 
_load_modules(self) 371 self._find_modules_with_dyld() 372 elif sys.platform == "win32": --> 373 self._find_modules_with_enum_process_module_ex() 374 else: 375 self._find_modules_with_dl_iterate_phdr() D:\anaconda files\lib\site-packages\threadpoolctl.py in _find_modules_with_enum_process_module_ex(self) 483 484 # Store the module if it is supported and selected --> 485 self._make_module_from_path(filepath) 486 finally: 487 kernel_32.CloseHandle(h_process) D:\anaconda files\lib\site-packages\threadpoolctl.py in _make_module_from_path(self, filepath) 513 if prefix in self.prefixes or user_api in self.user_api: 514 module_class = globals()[module_class] --> 515 module = module_class(filepath, prefix, user_api, internal_api) 516 self.modules.append(module) 517 D:\anaconda files\lib\site-packages\threadpoolctl.py in __init__(self, filepath, prefix, user_api, internal_api) 604 self.internal_api = internal_api 605 self._dynlib = ctypes.CDLL(filepath, mode=_RTLD_NOLOAD) --> 606 self.version = self.get_version() 607 self.num_threads = self.get_num_threads() 608 self._get_extra_info() D:\anaconda files\lib\site-packages\threadpoolctl.py in get_version(self) 644 lambda: None) 645 get_config.restype = ctypes.c_char_p --> 646 config = get_config().split() 647 if config[0] == b"OpenBLAS": 648 return config[1].decode("utf-8") AttributeError: 'NoneType' object has no attribute 'split'
from sklearn.cluster import KMeans


def _count_matched_samples(true_labels, cluster_labels):
    """Return how many samples fall in the majority class of their cluster.

    K-Means numbers its clusters arbitrarily, so cluster id 0 has no reason
    to coincide with encoded class 0, and comparing the two arrays
    element-wise says nothing about clustering quality. Instead, each
    cluster is mapped to the true class that occurs most often inside it,
    and the samples belonging to that class are counted as matched.

    Args:
        true_labels: 1-D array-like of reference class labels.
        cluster_labels: 1-D array-like of cluster ids, same length.

    Returns:
        The number of matched samples as an int.
    """
    # Rows are clusters, columns are true classes, cells are sample counts.
    contingency = pd.crosstab(cluster_labels, true_labels)
    # The row maximum is the size of each cluster's majority class.
    return int(contingency.max(axis=1).sum())


# Fit K-Means with 2, 3 and 5 clusters and report how well the resulting
# clusters line up with the encoded sector labels in `y`.
# After the loop `kmeans`, `labels` and `correct_labels` hold the k = 5 run.
for n_clusters in (2, 3, 5):
    kmeans = KMeans(n_clusters=n_clusters, random_state=0)
    kmeans.fit(X)
    labels = kmeans.labels_

    # Check how many of the samples were correctly labeled.
    correct_labels = _count_matched_samples(y, labels)
    print("Result: %d out of %d samples were correctly labeled." % (correct_labels, y.size))
    print('Accuracy score: {0:0.2f}'. format(correct_labels/float(y.size)))